In [1]:
import ucdeconvolve as ucd
import scanpy as sc
from spatialdata_io import xenium
# old numpy 1.26.4 in python_edger ; 2.2.4 pip
import spatialdata as sd
/home/s4716765/.conda/envs/edger/lib/python3.10/site-packages/dask/dataframe/__init__.py:31: FutureWarning: The legacy Dask DataFrame implementation is deprecated and will be removed in a future version. Set the configuration option `dataframe.query-planning` to `True` or None to enable the new Dask Dataframe implementation and silence this warning.
  warnings.warn(
In [2]:
import spatialdata as sd
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib.patches import Polygon
import numpy as np
import pandas as pd
import os
from spatialdata_io import xenium


def plot_xenium_spatial(
    xenium_sdata,
    x_region,
    y_region,
    gene1_name,
    gene2_name,
    cell_type_colors,
    gene1_threshold=0,
    gene2_threshold=3,
    save_dir="lnc_revision/xenium/",
    save_filename="Xenium_spatial_plot.pdf",
    figsize=(15, 10),
    dpi=300
):
    """
    Plot Xenium spatial data with cell boundaries colored by cell type and transcripts for two genes.
    
    Parameters:
    -----------
    xenium_sdata : spatialdata.SpatialData
        Loaded Xenium SpatialData object (user-defined).
    x_region : tuple
        (min, max) x-coordinates for region of interest (user-defined).
    y_region : tuple
        (min, max) y-coordinates for region of interest (user-defined).
    gene1_name : str
        Name of the first gene to plot transcripts for (user-defined).
    gene2_name : str
        Name of the second gene to plot transcripts for (user-defined).
    cell_type_colors : dict
        Dictionary mapping cell types to colors (user-defined).
    gene1_threshold : int, optional
        Minimum transcript count per cell for gene1 (default: 0).
    gene2_threshold : int, optional
        Minimum transcript count per cell for gene2 (default: 3).
    save_dir : str, optional
        Directory to save the plot (default: "lnc_revision/xenium/").
    save_filename : str, optional
        Filename for the saved plot (default: "Xenium_spatial_plot.pdf").
    figsize : tuple, optional
        Figure size (width, height) (default: (15, 10)).
    dpi : int, optional
        DPI for the plot (default: 300).
    
    Returns:
    --------
    None
        Displays and saves the plot, or raises an error if genes are not found.
    
    Raises:
    -------
    ValueError
        If gene1_name or gene2_name is not found in transcripts['feature_name'].
    """
    # Use xenium_sdata.tables['table']
    adata = xenium_sdata.tables['table']
    print("Using xenium_sdata.tables['table'] as adata")
    print("adata shape:", adata.shape)

    # Debug: Check adata basics
    print("adata.obs columns:", adata.obs.columns.tolist())
    print("adata.var_names sample:", adata.var_names[:10].tolist())

    # Check if predicted.id exists
    if 'predicted.id' not in adata.obs.columns:
        print("Warning: predicted.id not found in adata.obs. Using placeholder.")
        adata.obs['predicted.id'] = 'nan'
    else:
        print("adata.obs['predicted.id'] unique values:", adata.obs['predicted.id'].unique())

    # Ensure adata.obs index uses cell_id
    if 'cell_id' in adata.obs.columns:
        adata.obs.index = adata.obs['cell_id'].astype(str)
    else:
        print("Error: cell_id not found in adata.obs.columns")
        raise KeyError("cell_id missing")

    # Convert predicted.id to strings and handle np.nan
    adata.obs['predicted.id'] = adata.obs['predicted.id'].astype(str).replace('nan', np.nan).fillna("nan")

    # Check for missing cell types
    missing_cell_types = [ct for ct in adata.obs['predicted.id'].unique() if ct not in cell_type_colors]
    if missing_cell_types:
        print("Warning: Missing cell types in cell_type_colors:", missing_cell_types)
        for ct in missing_cell_types:
            cell_type_colors[ct] = "grey"  # Add grey as fallback for missing types

    # Create color mapping
    cell_types_to_keep = list(adata.obs["predicted.id"].unique())
    subset_cell_type_colors = {key: cell_type_colors[key] for key in cell_types_to_keep if key in cell_type_colors}
    subset_cell_type_colors = dict(sorted(subset_cell_type_colors.items()))
    cell_type_to_color = subset_cell_type_colors

    # Extract spatial coordinates and filter to region
    if 'spatial' in adata.obsm:
        spatial_coords = adata.obsm['spatial']
        print("Spatial coords range: x", spatial_coords[:, 0].min(), spatial_coords[:, 0].max(),
              "y", spatial_coords[:, 1].min(), spatial_coords[:, 1].max())
        mask = (spatial_coords[:, 0] >= x_region[0]) & (spatial_coords[:, 0] <= x_region[1]) & \
               (spatial_coords[:, 1] >= y_region[0]) & (spatial_coords[:, 1] <= y_region[1])
        adata_subset = adata[mask].copy()
        spatial_coords_subset = adata_subset.obsm['spatial']
    else:
        print("Error: adata.obsm['spatial'] not found.")
        raise KeyError("spatial missing")

    print("adata_subset shape:", adata_subset.shape)
    print("adata_subset.obs index sample:", adata_subset.obs.index[:5].tolist())

    # Extract cell boundaries
    try:
        cell_boundaries = xenium_sdata.shapes['cell_boundaries']
    except KeyError:
        print("Error: Cell boundaries not found in xenium_sdata.shapes['cell_boundaries'].")
        raise

    # Debug: Inspect cell boundaries
    print("cell_boundaries shape:", cell_boundaries.shape)
    print("cell_boundaries index sample:", cell_boundaries.index[:5].tolist())
    print("cell_boundaries geometry types:", cell_boundaries['geometry'].geom_type.value_counts())
    matching_cells = cell_boundaries.index.isin(adata_subset.obs.index)
    print("Number of matching cell IDs:", matching_cells.sum())

    # Filter boundaries
    cell_boundaries_filtered = cell_boundaries.loc[cell_boundaries.index.isin(adata_subset.obs.index)]
    print("Filtered cell_boundaries shape:", cell_boundaries_filtered.shape)

    # Extract transcript data
    try:
        transcripts = xenium_sdata.points['transcripts'].compute()
    except AttributeError:
        transcripts = xenium_sdata.points['transcripts']
    transcripts = transcripts.dropna()

    # Debug: Check transcripts
    print("Transcripts shape:", transcripts.shape)
    print("Transcripts cell_id sample:", transcripts['cell_id'].head().tolist())
    # Search for gene1_name and gene2_name
    print(f"Checking for {gene1_name} and {gene2_name} in feature_name:")
    gene1_matches = transcripts['feature_name'].str.contains(gene1_name, case=False, na=False).sum()
    gene2_matches = transcripts['feature_name'].str.contains(gene2_name, case=False, na=False).sum()
    print(f"{gene1_name} matches: {gene1_matches}")
    print(f"{gene2_name} matches: {gene2_matches}")
    if gene1_matches == 0:
        raise ValueError(f"Gene '{gene1_name}' not found in transcripts['feature_name']. Check available genes.")
    if gene2_matches == 0:
        raise ValueError(f"Gene '{gene2_name}' not found in transcripts['feature_name']. Check available genes.")

    # Filter transcripts to region
    transcripts_cropped = transcripts[
        (transcripts['x'] >= x_region[0]) & (transcripts['x'] <= x_region[1]) &
        (transcripts['y'] >= y_region[0]) & (transcripts['y'] <= y_region[1])
    ]
    print("transcripts_cropped shape:", transcripts_cropped.shape)

    # Aggregate transcripts per cell
    transcripts_per_cell = transcripts_cropped.groupby(['cell_id', 'feature_name']).size().unstack(fill_value=0)
    transcripts_per_cell = transcripts_per_cell.reindex(adata_subset.obs.index, fill_value=0)
    print("transcripts_per_cell shape:", transcripts_per_cell.shape)
    print(f"Max {gene1_name} counts:", transcripts_per_cell.get(gene1_name, 0).max())
    print(f"Max {gene2_name} counts:", transcripts_per_cell.get(gene2_name, 0).max())

    # Filter cells
    gene1_cells = transcripts_per_cell[transcripts_per_cell.get(gene1_name, 0) > gene1_threshold].index
    gene2_cells = transcripts_per_cell[transcripts_per_cell.get(gene2_name, 0) > gene2_threshold].index
    print(f"Cells with {gene1_name} > {gene1_threshold}:", len(gene1_cells))
    print(f"Cells with {gene2_name} > {gene2_threshold}:", len(gene2_cells))

    # Get transcripts
    gene1 = transcripts_cropped[
        (transcripts_cropped['feature_name'] == gene1_name) & 
        (transcripts_cropped['cell_id'].isin(gene1_cells))
    ]
    gene2 = transcripts_cropped[
        (transcripts_cropped['feature_name'] == gene2_name) & 
        (transcripts_cropped['cell_id'].isin(gene2_cells))
    ]
    print(f"{gene1_name} transcripts:", len(gene1))
    print(f"{gene2_name} transcripts:", len(gene2))

    # Set up the plot
    with plt.rc_context({"figure.figsize": (5, 4), "figure.dpi": dpi}):
        fig, ax = plt.subplots(figsize=figsize)

        # Plot boundaries
        boundaries_plotted = False
        if cell_boundaries_filtered.shape[0] > 0:
            for cell_id in adata_subset.obs.index:
                if cell_id in cell_boundaries_filtered.index:
                    try:
                        geom = cell_boundaries_filtered.loc[cell_id, 'geometry']
                        if geom.geom_type == 'Polygon':
                            coords = np.array(geom.exterior.coords)
                        elif geom.geom_type == 'MultiPolygon':
                            coords = np.array(max(geom.geoms, key=lambda x: x.area).exterior.coords)
                        else:
                            continue
                        cell_type = adata_subset.obs.loc[cell_id, 'predicted.id']
                        color = cell_type_to_color.get(cell_type, 'grey')
                        polygon = Polygon(coords, facecolor=color, edgecolor='none', alpha=1)
                        ax.add_patch(polygon)
                        boundaries_plotted = True
                    except Exception as e:
                        print(f"Failed to plot cell {cell_id}: {e}")
                        continue
        else:
            print("No matching boundaries, plotting centroids")

        # Fallback to centroids
        if not boundaries_plotted:
            cell_colors = adata_subset.obs['predicted.id'].map(cell_type_to_color).to_numpy()
            ax.scatter(
                spatial_coords_subset[:, 0], spatial_coords_subset[:, 1],
                c=cell_colors, s=10, alpha=1, label='Cells'
            )

        # Plot gene expression
        if not gene1.empty:
            ax.scatter(
                gene1['x'], gene1['y'],
                color='yellow', marker='o', s=3, zorder=10, label=f'{gene1_name} (>{gene1_threshold} transcripts)'
            )
        if not gene2.empty:
            ax.scatter(
                gene2['x'], gene2['y'],
                color='black', marker='o', s=3, zorder=10, label=f'{gene2_name} (>{gene2_threshold} transcripts)'
            )

        # Set aspect ratio
        ax.set_aspect('equal', adjustable='box')

        # Add legends
        ax.legend(loc='upper right')
        from matplotlib.lines import Line2D
        legend_elements = [
            Line2D([0], [0], marker='s', color='w', label=cell_type,
                   markerfacecolor=color, markersize=10, linestyle='None')
            for cell_type, color in subset_cell_type_colors.items()
        ]
        ax.legend(handles=legend_elements, loc='center left', bbox_to_anchor=(1, 0.5), title="Cell Types")

        # Add title and labels
        ax.set_title("Xenium Spatial Plot: Cell Types and Gene Expression")
        ax.set_xlabel("X Coordinate")
        ax.set_ylabel("Y Coordinate")

        # Adjust layout
        plt.tight_layout()

        # Ensure save directory exists
        os.makedirs(save_dir, exist_ok=True)
        # Save plot
        plt.savefig(os.path.join(save_dir, save_filename), bbox_inches="tight")
        plt.show()
In [3]:
# ----------------------------
# Universal color scheme
# ----------------------------
cell_type_colors_Breast = {
    "Cancer Epithelial Cells": "#FF0000",   # Bright red (cancer)
    "Epithelial Cells": "#FF6347",          # Tomato
    "Myoepithelial Cells": "#FFA07A",       # Light Salmon
    "Fibroblasts": "#228B22",               # Forest Green
    "Endothelial Cells": "#8B4513", #"#DC143C",         # Crimson
    "Perivascular-like (PVL) Cells": "#6A5ACD", # Slate Blue
    "CD4+ T Cells": "#4169E1",              # Royal Blue
    "CD8+ T Cells": "#00CED1",              # Dark Turquoise
    "Regulatory T Cells": "#1E90FF",        # Dodger Blue
    "B Cells": "#DDA0DD",                   # Plum
    "Plasma Cells": "#F1EA9D",              # Pale Yellow
    "NK Cells": "#32CD32",                  # Lime Green
    "MDSCs": "#8B008B",                     # Dark Magenta
    "Dendritic Cells": "#20B2AA",           # Light Sea Green
    "Monocytes": "#FFD700",                 # Gold
    "Macrophages": "#FFDAB9",               # Peach Puff
    "Mast Cells": "#FF69B4",                # Hot Pink
    "nan": "#808080"                        # Grey
}

cell_type_colors_Melanoma = {
    "Melanocytes": "#FF0000",               # Bright red (cancer)
    "KC Basal": "#FF4500",                  # Orange Red
    "KC Granular": "#FF6347",               # Tomato
    "KC Differentiating": "#FFA500",        # Orange
    "KC Cornified": "#FFD700",              # Gold
    "KC Hair": "#DAA520",                   # Goldenrod
    "Sweat gland related": "#98FB98",       # Pale Green
    "Fibroblast": "#228B22",                # Forest Green
    "Endothelial Cell": "#8B4513",          # Crimson
    "Pericytes": "#6A5ACD",                 # Slate Blue
    "T Cell": "#4169E1",                    # Royal Blue
    "NK": "#00CED1",                        # Dark Turquoise
    "B Cell": "#DDA0DD",                    # Plum
    "Plasma Cell": "#F1EA9D",               # Pale Yellow
    "Macrophage": "#FFDAB9",                # Peach Puff
    "DC": "#20B2AA",                        # Light Sea Green
    "LC": "#00FF7F",                        # Spring Green
    "Mast Cell": "#FF69B4",                 # Hot Pink
    "nan": "#808080"                        # Grey
}

cell_type_colors_HN = {
    "Epithelial": "#FF0000",                # Bright red (cancer)
    "CAFs": "#228B22",                      # Forest Green
    "Myofib": "#32CD32",                    # Lime Green
    "Endothelial": "#8B4513",               # Crimson
    "T cells": "#4169E1",                   # Royal Blue
    "NK": "#00CED1",                        # Dark Turquoise
    "B cells": "#DDA0DD",                   # Plum
    "plasma cells": "#F1EA9D",              # Pale Yellow
    "TAMs": "#FFDAB9",                      # Peach Puff (macrophages/TAMs)
    "mature DC": "#20B2AA",                 # Light Sea Green
    "pDC": "#00FF7F",                       # Spring Green
    "Mast cells": "#FF69B4",                # Hot Pink
    "Salivary": "#98FB98",                  # Pale Green
    "nan": "#808080"                        # Grey
}

cell_type_colors_CRC = {
    "Cancer_Epi": "#FF0000",                # Bright red (cancer)
    "Epithelial": "#FF6347",                # Tomato
    "Fibroblast 2": "#32CD32",              # Lime Green
    "Fibroblast 3": "#228B22",              # Forest Green
    "Endothelial/SM/Schwann": "#8B4513",    # Crimson
    "T CD4": "#4169E1",                     # Royal Blue
    "T CD8": "#00CED1",                     # Dark Turquoise
    "CD8+ CXCL13+ HSP+": "#00B7EB",         # Sky Blue
    "B cells": "#DDA0DD",                   # Plum
    "Plasma cells": "#F1EA9D",              # Pale Yellow
    "Myeloid 1(Mono/Granulo)": "#FFD700",   # Gold
    "Myeloid 2 (Macrophages)": "#FFDAB9",   # Peach Puff
    "Myeloid 3 Dendritic": "#20B2AA",       # Light Sea Green
    "Mast cells": "#FF69B4",                # Hot Pink
    "nan": "#808080"                        # Grey
}

cell_type_colors_GBM = {
    "GBM cell": "#FF0000",                  # Bright red (cancer)
    "Neuron": "#4682B4",                    # Steel Blue
    "Astrocyte": "#32CD32",                 # Lime Green
    "Oligodendrocyte": "#DAA520",           # Goldenrod
    "Myeloid": "#FFD700",                   # Gold (microglia/macrophages)
    "T cell": "#4169E1",                    # Royal Blue
    "B cell": "#DDA0DD",                    # Plum (if present)
    "NK cell": "#00CED1",                   # Dark Turquoise (if present)
    "nan": "#808080"                        # Grey
}

Breast¶

In [ ]:
from spatialdata_io import xenium

# Load Xenium spatial data (replace with your actual path to the Xenium zarr or other format)
xenium_zarr_path = "/QRISdata/Q4386/Xenium_20250319__042556__20250319_lncRNA_IO_Cancer/output-XETG00114__0016985__BreastCancer__20250319__042625/"  # Update this path
xenium_sdata = xenium(xenium_zarr_path)
# Load the metadata CSV file (replace with your actual path)
metadata_path = "lnc_revision/xenium/mel/clean-data/BreastCancer_LT_breast_ref_subsetted_ref.csv"  # Update this path
metadata_df = pd.read_csv(metadata_path, index_col=0)

# Inspect the metadata to ensure it has the required columns
print("Metadata columns:", metadata_df.columns)
print("First few rows of metadata:\n", metadata_df.head())

# Inspect the xenium_sdata.table.obs to understand its structure
print("xenium_sdata.table.obs columns:", xenium_sdata.table.obs.columns)
print("First few rows of xenium_sdata.table.obs:\n", xenium_sdata.table.obs.head())

# Assume the metadata CSV has a 'cell_id' column and a 'cell_type' column
# Ensure the 'cell_id' column in metadata matches the index or a column in xenium_sdata.table.obs
# If cell_id is not the index in xenium_sdata.table.obs, identify the correct column (e.g., 'cell_id')
if 'cell_id' in xenium_sdata.table.obs.columns:
    cell_id_col = 'cell_id'
else:
    # If cell_id is the index of xenium_sdata.table.obs
    cell_id_col = xenium_sdata.table.obs.index.name
    xenium_sdata.table.obs[cell_id_col] = xenium_sdata.table.obs.index

# Merge the cell types from the metadata into xenium_sdata.table.obs
# Set the index of metadata_df to the cell_id column for merging
#metadata_df.set_index('cell_id', inplace=True)

# Ensure the cell_id column in xenium_sdata.table.obs is the index for proper joining
xenium_sdata.table.obs.set_index(cell_id_col, inplace=True)

# Join the cell_type column from metadata_df into xenium_sdata.table.obs
xenium_sdata.table.obs = xenium_sdata.table.obs.join(metadata_df['predicted.id'], how='left')

# Reset the index if needed (optional, depending on your downstream analysis)
xenium_sdata.table.obs.reset_index(inplace=True)
In [ ]:
#### Assuming xenium_sdata is already loaded
# xenium_sdata = sd.read_zarr("path_to_your_xenium_sdata.zarr")

cell_type_colors_Breast = {
            "Cancer Epithelial Cells": "#ff4040",
            "CD4+ T Cells": "#41baeb",
            "B Cells": "#fed9b9",
            "Plasma Cells": "#f1ea9d",
            "NK Cells": "#99ca3e",
            "CD8+ T Cells": "#406573",
            "Fibroblasts": "#458b41",
            "Epithelial Cells": "#d8a1a1",
            "Myoepithelial Cells": "#a1d8a1",
            "nan": "grey",
            "Endothelial Cells": "#f8a41e",
            "Perivascular-like (PVL) Cells": "#dca566",
            "Mast Cells": "#7f8133",
            "Macrophages": "#eae71d",
            "Regulatory T Cells": "#bbe5f3",
            "MDSCs": "#8b008b",
            "Dendritic Cells": "#5f9d9e",
            "Monocytes": "#9cc7a1",
        }
In [24]:
plot_xenium_spatial(
     xenium_sdata=xenium_sdata,
     x_region=(0, 5000),
     y_region=(6000, 8500),
     gene1_name='BIRC5',
     gene2_name='cuTAR215705',cell_type_colors=cell_type_colors_Breast,
     save_filename="paper_Xenium_BIRC5_cuTAR215705_ERpos_boundaries_polygons.pdf"
)
Using xenium_sdata.tables['table'] as adata
adata shape: (464673, 480)
adata.obs columns: ['cell_id', 'transcript_counts', 'control_probe_counts', 'genomic_control_counts', 'control_codeword_counts', 'unassigned_codeword_counts', 'deprecated_codeword_counts', 'total_counts', 'cell_area', 'nucleus_area', 'nucleus_count', 'segmentation_method', 'region', 'z_level', 'cell_labels', 'predicted.id']
adata.var_names sample: ['A2M', 'ACE2', 'ACTA2', 'ACTB', 'ADAM28', 'AIF1', 'AIRE', 'AKT1', 'ALK', 'ANPEP']
adata.obs['predicted.id'] unique values: ['Cancer Epithelial Cells' 'CD4+ T Cells' 'Macrophages' 'Plasma Cells'
 'Epithelial Cells' 'Fibroblasts' 'CD8+ T Cells' 'Endothelial Cells'
 'B Cells' 'Regulatory T Cells' 'NK Cells' 'Myoepithelial Cells'
 'Mast Cells' 'nan' 'Monocytes' 'Dendritic Cells'
 'Perivascular-like (PVL) Cells' 'MDSCs']
Spatial coords range: x 6.468607425689697 8929.0244140625 y 4.90940523147583 9254.44140625
adata_subset shape: (108962, 480)
adata_subset.obs index sample: ['aaakjklh-1', 'aaalfhfd-1', 'aaalfolm-1', 'aaalgdpe-1', 'aaalnebd-1']
cell_boundaries shape: (464673, 1)
cell_boundaries index sample: ['aaaabalk-1', 'aaaaefcj-1', 'aaaaekcn-1', 'aaaaemhb-1', 'aaaaieck-1']
cell_boundaries geometry types: Polygon    464673
Name: count, dtype: int64
Number of matching cell IDs: 108962
Filtered cell_boundaries shape: (108962, 1)
Transcripts shape: (72390350, 13)
Transcripts cell_id sample: ['UNASSIGNED', 'UNASSIGNED', 'UNASSIGNED', 'UNASSIGNED', 'UNASSIGNED']
Checking for BIRC5 and cuTAR215705 in feature_name:
BIRC5 matches: 23354
cuTAR215705 matches: 202753
transcripts_cropped shape: (15395766, 13)
/scratch/temp/16262444/ipykernel_1142355/1397492350.py:168: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  transcripts_per_cell = transcripts_cropped.groupby(['cell_id', 'feature_name']).size().unstack(fill_value=0)
transcripts_per_cell shape: (108962, 540)
Max BIRC5 counts: 6
Max cuTAR215705 counts: 12
Cells with BIRC5 > 0: 2547
Cells with cuTAR215705 > 3: 2165
BIRC5 transcripts: 2924
cuTAR215705 transcripts: 10156
No description has been provided for this image